# Raw features under study.
raw_features <- c("DF", "CF", "BF", "AF", "AN")

# Plot the cloud label and each raw feature across all images.
for (var in c("label", raw_features)) {
  cat(sprintf("\n\n### %s\n\n", var))
  plt <- plot_cloud_data(cloud_data, var)
  vthemes::subchunkify(
    plt, i = subchunk_idx, fig_width = 10, fig_height = 4
  )
  subchunk_idx <- subchunk_idx + 1
}

# For each image, draw pairwise scatterplots of the raw features,
# colored by the cloud label.
for (image_id in unique(cloud_data$image)) {
  plt_df <- cloud_data |>
    dplyr::filter(image == !!image_id)
  plt <- plot_pairs(
    plt_df, columns = raw_features,
    point_size = 0.1, subsample = 0.5, color = plt_df$label
  ) +
    ggplot2::scale_color_manual(values = cloud_colors) +
    ggplot2::scale_fill_manual(values = cloud_colors) +
    ggplot2::labs(title = sprintf("Image %s", image_id))
  vthemes::subchunkify(
    plt, i = subchunk_idx, fig_width = 10, fig_height = 10
  )
  subchunk_idx <- subchunk_idx + 1
}

# Engineered features from Shi et al. (2008).
engineered_features <- c("NDAI", "SD", "CORR")
# Plot the cloud label and each engineered feature across all images.
for (var in c("label", engineered_features)) {
  cat(sprintf("\n\n### %s\n\n", var))
  plt <- plot_cloud_data(cloud_data, var)
  vthemes::subchunkify(
    plt, i = subchunk_idx, fig_width = 10, fig_height = 4
  )
  subchunk_idx <- subchunk_idx + 1
}

To mimic the process of how we obtain our future data (in this case, we expect to obtain completely new images), we leave out one full image for the test set. For cross-validation and the training-validation splits, we also perform clustered sampling, where we partition the images into contiguous blocks to maintain the integrity of the image. Below, we show the image blocks used in the data splitting scheme.
# Divide each image into contiguous spatial blocks for clustered splitting.
cloud_data <- add_cloud_blocks(cloud_data_ls)
plt <- plot_cloud_data(cloud_data, var = "block_id")
plt

# Hold out one full image as the test set.
# NOTE(review): sample() is not seeded here, so the chosen test image can
# change between runs — consider set.seed() upstream for reproducibility.
test_image_idx <- sample(unique(cloud_data$image), 1)
train_data_all <- cloud_data |>
  dplyr::filter(!(image %in% test_image_idx))
test_data_all <- cloud_data |>
  dplyr::filter(image %in% test_image_idx)
print(sprintf("Test image: %s", test_image_idx))
## [1] "Test image: 1"
We next use the aforementioned data splitting scheme to both tune model hyperparameters and select the best model from the following candidates:
Models under consideration: logistic regression, LASSO-penalized logistic regression, ridge-penalized logistic regression, and random forests.
Note that within glmnet::cv.glmnet, there is an interior cross-validation loop to tune the hyperparameters for LASSO and ridge, and by explicitly setting the foldid, we ensure that the cross-validation is done using our clustered sampling scheme by image block. If the R function doesn’t have a built-in CV option, we would need to code this up ourselves (or use functions from other packages, such as the caret R package). Within glmnet::cv.glmnet, there is also a re-fitting step, where the best hyperparameters are used to fit the model on the full training set.
# Raw features plus the binary response used for modeling.
keep_vars <- c("DF", "CF", "BF", "AF", "AN", "binary_label")
# NOTE(review): train_data/test_data computed here appear unused before they
# are overwritten after the CV loop — confirm and consider removing.
train_data <- cloud_data |>
dplyr::filter(!(image %in% test_image_idx))
test_data <- cloud_data |>
dplyr::filter(image %in% test_image_idx)
# evaluate validation error for various models
# Accumulators: one entry per held-out image block (fold).
valid_auroc_ls <- list()
valid_auprc_ls <- list()
valid_preds_ls <- list()
# Leave-one-block-out cross-validation over the training images: each block
# serves once as the validation fold; the remaining blocks are used to fit
# logistic regression, LASSO, ridge, and random forest models.
for (fold in unique(train_data_all$block_id)) {
# do data split
# Hold out the current block as the validation fold; train on the rest.
cv_train_data_all <- train_data_all |>
dplyr::filter(block_id != !!fold)
cv_train_data <- cv_train_data_all |>
dplyr::select(tidyselect::all_of(keep_vars))
cv_valid_data_all <- train_data_all |>
dplyr::filter(block_id == !!fold)
cv_valid_data <- cv_valid_data_all |>
dplyr::select(tidyselect::all_of(keep_vars))
# fit logistic regression
log_fit <- glm(
binary_label ~ ., data = cv_train_data, family = "binomial"
)
log_preds <- predict(log_fit, cv_valid_data, type = "response")
# fit and evaluate lasso regression
lasso_fit <- glmnet::cv.glmnet(
x = as.matrix(cv_train_data |> dplyr::select(-binary_label)),
y = cv_train_data$binary_label,
family = "binomial",
alpha = 1,
# inner CV folds for tuning lambda follow the same image-block structure
foldid = as.numeric(as.factor(cv_train_data_all$block_id))
)
lasso_preds <- predict(
lasso_fit,
as.matrix(cv_valid_data |> dplyr::select(-binary_label)),
s = "lambda.min", # or "lambda.1se"
type = "response"
)
# fit and evaluate ridge regression
ridge_fit <- glmnet::cv.glmnet(
x = as.matrix(cv_train_data |> dplyr::select(-binary_label)),
y = cv_train_data$binary_label,
family = "binomial",
alpha = 0,
# inner CV folds for tuning lambda follow the same image-block structure
foldid = as.numeric(as.factor(cv_train_data_all$block_id))
)
ridge_preds <- predict(
ridge_fit,
as.matrix(cv_valid_data |> dplyr::select(-binary_label)),
s = "lambda.min", # or "lambda.1se"
type = "response"
)
# fit random forest
rf_fit <- ranger::ranger(
binary_label ~ ., data = cv_train_data,
probability = TRUE, verbose = FALSE
)
# NOTE(review): assumes the positive class is the second column of ranger's
# probability matrix — confirm the factor level order of binary_label.
rf_preds <- predict(rf_fit, cv_valid_data)$predictions[, 2]
# evaluate predictions
preds_ls <- list(
"logistic" = log_preds,
"lasso" = lasso_preds,
"ridge" = ridge_preds,
"rf" = rf_preds
)
# c(.x) flattens glmnet's one-column prediction matrix into a plain vector;
# event_level = "second" matches the positive-class convention above.
# NOTE(review): .id = "method" looks ineffective here — the named list of
# scalar AUCs collapses into a single wide one-row tibble — confirm.
valid_auroc_ls[[fold]] <- purrr::map(
preds_ls,
~ yardstick::roc_auc_vec(
truth = cv_valid_data$binary_label,
estimate = c(.x),
event_level = "second"
)
) |>
dplyr::bind_rows(.id = "method")
valid_auprc_ls[[fold]] <- purrr::map(
preds_ls,
~ yardstick::pr_auc_vec(
truth = cv_valid_data$binary_label,
estimate = c(.x),
event_level = "second"
)
) |>
dplyr::bind_rows(.id = "method")
# save fold predictions for future investigation
valid_preds_ls[[fold]] <- cv_valid_data_all |>
dplyr::bind_cols(preds_ls)
}
# Examine validation accuracy: stack per-fold metrics and average over folds.
# Keep a copy of the fold-wise predictions from this (raw-features) run.
valid_preds_ls1 <- valid_preds_ls

valid_auroc_df <- dplyr::bind_rows(valid_auroc_ls, .id = "fold")
valid_auprc_df <- dplyr::bind_rows(valid_auprc_ls, .id = "fold")

mean_valid_auroc_df <- valid_auroc_df |>
  dplyr::summarise(dplyr::across(-fold, \(x) mean(x, na.rm = TRUE)))
mean_valid_auprc_df <- valid_auprc_df |>
  dplyr::summarise(dplyr::across(-fold, \(x) mean(x, na.rm = TRUE)))
# Refit the best model (random forest, per validation results) on the full
# training set and evaluate it on the held-out test image.
train_data <- dplyr::select(train_data_all, tidyselect::all_of(keep_vars))
test_data <- dplyr::select(test_data_all, tidyselect::all_of(keep_vars))

best_fit <- ranger::ranger(
  binary_label ~ .,
  data = train_data, probability = TRUE, verbose = FALSE
)
# Predicted probability of the positive (second) class.
test_preds <- predict(best_fit, test_data)$predictions[, 2]

test_auroc <- yardstick::roc_auc_vec(
  truth = test_data$binary_label, estimate = test_preds,
  event_level = "second"
)
test_auprc <- yardstick::pr_auc_vec(
  truth = test_data$binary_label, estimate = test_preds,
  event_level = "second"
)

# Combine mean validation metrics with the test metric into one table.
auroc_df <- mean_valid_auroc_df |>
  dplyr::rename_with(
    \(x) sprintf("Validation %s AUROC", stringr::str_to_title(x))
  ) |>
  dplyr::mutate(`Test AUROC` = test_auroc)
auprc_df <- mean_valid_auprc_df |>
  dplyr::rename_with(
    \(x) sprintf("Validation %s AUPRC", stringr::str_to_title(x))
  ) |>
  dplyr::mutate(`Test AUPRC` = test_auprc)
# Display fold-wise validation AUROC for each candidate model.
vthemes::pretty_kable(
  valid_auroc_df,
  caption = "Fold-wise Validation AUROC for Various Models"
)

| fold | logistic | lasso | ridge | rf |
|---|---|---|---|---|
| 6 | 0.885 | 0.885 | 0.886 | 0.881 |
| 5 | 0.791 | 0.798 | 0.811 | 0.713 |
| 7 | 0.817 | 0.814 | 0.815 | 0.851 |
| 8 | 0.907 | 0.908 | 0.912 | 0.928 |
| 10 | 0.378 | 0.378 | 0.382 | 0.493 |
| 9 | 0.681 | 0.681 | 0.678 | 0.578 |
| 11 | 0.130 | 0.129 | 0.124 | 0.602 |
| 12 | 0.652 | 0.653 | 0.655 | 0.843 |
| Validation Logistic AUROC | Validation Lasso AUROC | Validation Ridge AUROC | Validation Rf AUROC | Test AUROC |
|---|---|---|---|---|
| 0.655 | 0.656 | 0.658 | 0.736 | 0.758 |
| fold | logistic | lasso | ridge | rf |
|---|---|---|---|---|
| 6 | 0.759 | 0.760 | 0.762 | 0.777 |
| 5 | 0.953 | 0.955 | 0.959 | 0.925 |
| 7 | 0.289 | 0.286 | 0.284 | 0.359 |
| 8 | 0.372 | 0.373 | 0.380 | 0.485 |
| 10 | 0.321 | 0.321 | 0.323 | 0.388 |
| 9 | 0.311 | 0.311 | 0.306 | 0.266 |
| 11 | 0.0117 | 0.0117 | 0.0116 | 0.0328 |
| 12 | 0.153 | 0.153 | 0.158 | 0.340 |
| Validation Logistic AUPRC | Validation Lasso AUPRC | Validation Ridge AUPRC | Validation Rf AUPRC | Test AUPRC |
|---|---|---|---|---|
| 0.396 | 0.396 | 0.398 | 0.446 | 0.327 |
How are we doing? Are we doing well? Any concerns?
Let’s look at the held-out folds where the methods didn’t perform so well and compare the predictions across methods.
# Variables to visualize per fold: truth, each model's predictions, and the
# raw features. (Engineered features excluded in this first-pass run.)
plot_vars <- c(
  "binary_label", "logistic", "lasso", "ridge", "rf",
  "DF", "CF", "BF", "AF", "AN"#, "NDAI", "SD", "CORR"
)
# For each held-out fold, plot true labels, model predictions, and features
# side by side to diagnose where the models struggle.
for (fold in unique(train_data_all$block_id)) {
  cat(sprintf("\n\n#### Fold %s\n\n", fold))
  preds_df <- valid_preds_ls[[fold]]
  plt_ls <- list()
  for (var in plot_vars) {
    plt_ls[[var]] <- plot_cloud_data(preds_df, var)
  }
  plt <- patchwork::wrap_plots(plt_ls, ncol = 5)
  vthemes::subchunkify(
    plt, i = subchunk_idx, fig_width = 14, fig_height = 6
  )
  subchunk_idx <- subchunk_idx + 1
}
# fold <- "8" # good fold
# fold <- "10" # "11" # bad fold
# preds_df <- valid_preds_ls1[[fold]]
# plot_vars <- c(
# # "label",
# "binary_label", "logistic", "lasso", "ridge", "rf",
# "DF", "CF", "BF", "AF", "AN"#, "NDAI", "SD", "CORR"
# )
# plt_ls <- list()
# for (var in plot_vars) {
# plt_ls[[var]] <- plot_cloud_data(preds_df, var)
# }
# plt <- patchwork::wrap_plots(plt_ls, ncol = 5)
# pltEngineering features based upon prior or domain knowledge can greatly improve the accuracy of our models. In Shi et al. (2008), the authors engineered three additional features:
Let’s repeat the previous analysis including these engineered features.
# Add the engineered features to the modeling variables.
keep_vars <- c("NDAI", "SD", "CORR", keep_vars)
# NOTE(review): train_data/test_data computed here appear unused before they
# are overwritten after the CV loop — confirm and consider removing.
train_data <- cloud_data |>
dplyr::filter(!(image %in% test_image_idx))
test_data <- cloud_data |>
dplyr::filter(image %in% test_image_idx)
# evaluate validation error for various models
# Reset accumulators: one entry per held-out image block (fold).
valid_auroc_ls <- list()
valid_auprc_ls <- list()
valid_preds_ls <- list()
# Leave-one-block-out cross-validation, now including the engineered
# features (NDAI, SD, CORR) alongside the raw features. Structure mirrors
# the earlier raw-features-only CV loop.
for (fold in unique(train_data_all$block_id)) {
# do data split
# Hold out the current block as the validation fold; train on the rest.
cv_train_data_all <- train_data_all |>
dplyr::filter(block_id != !!fold)
cv_train_data <- cv_train_data_all |>
dplyr::select(tidyselect::all_of(keep_vars))
cv_valid_data_all <- train_data_all |>
dplyr::filter(block_id == !!fold)
cv_valid_data <- cv_valid_data_all |>
dplyr::select(tidyselect::all_of(keep_vars))
# fit logistic regression
log_fit <- glm(
binary_label ~ ., data = cv_train_data, family = "binomial"
)
log_preds <- predict(log_fit, cv_valid_data, type = "response")
# fit and evaluate lasso regression
lasso_fit <- glmnet::cv.glmnet(
x = as.matrix(cv_train_data |> dplyr::select(-binary_label)),
y = cv_train_data$binary_label,
family = "binomial",
alpha = 1,
# inner CV folds for tuning lambda follow the same image-block structure
foldid = as.numeric(as.factor(cv_train_data_all$block_id))
)
lasso_preds <- predict(
lasso_fit,
as.matrix(cv_valid_data |> dplyr::select(-binary_label)),
s = "lambda.min", # or "lambda.1se"
type = "response"
)
# fit and evaluate ridge regression
ridge_fit <- glmnet::cv.glmnet(
x = as.matrix(cv_train_data |> dplyr::select(-binary_label)),
y = cv_train_data$binary_label,
family = "binomial",
alpha = 0,
# inner CV folds for tuning lambda follow the same image-block structure
foldid = as.numeric(as.factor(cv_train_data_all$block_id))
)
ridge_preds <- predict(
ridge_fit,
as.matrix(cv_valid_data |> dplyr::select(-binary_label)),
s = "lambda.min", # or "lambda.1se"
type = "response"
)
# fit random forest
rf_fit <- ranger::ranger(
binary_label ~ ., data = cv_train_data,
probability = TRUE, verbose = FALSE
)
# NOTE(review): assumes the positive class is the second column of ranger's
# probability matrix — confirm the factor level order of binary_label.
rf_preds <- predict(rf_fit, cv_valid_data)$predictions[, 2]
# evaluate predictions
preds_ls <- list(
"logistic" = log_preds,
"lasso" = lasso_preds,
"ridge" = ridge_preds,
"rf" = rf_preds
)
# c(.x) flattens glmnet's one-column prediction matrix into a plain vector;
# event_level = "second" matches the positive-class convention above.
# NOTE(review): .id = "method" looks ineffective here — the named list of
# scalar AUCs collapses into a single wide one-row tibble — confirm.
valid_auroc_ls[[fold]] <- purrr::map(
preds_ls,
~ yardstick::roc_auc_vec(
truth = cv_valid_data$binary_label,
estimate = c(.x),
event_level = "second"
)
) |>
dplyr::bind_rows(.id = "method")
valid_auprc_ls[[fold]] <- purrr::map(
preds_ls,
~ yardstick::pr_auc_vec(
truth = cv_valid_data$binary_label,
estimate = c(.x),
event_level = "second"
)
) |>
dplyr::bind_rows(.id = "method")
# save fold predictions for future investigation
valid_preds_ls[[fold]] <- cv_valid_data_all |>
dplyr::bind_cols(preds_ls)
}
# Examine validation accuracy: tag this run's metrics as "(new)" and join
# them against the raw-features-only results for side-by-side comparison.
valid_preds_ls2 <- valid_preds_ls

new_valid_auroc_df <- dplyr::bind_rows(valid_auroc_ls, .id = "fold") |>
  dplyr::rename_with(\(x) sprintf("%s (new)", stringr::str_to_title(x))) |>
  dplyr::rename(fold = "Fold (new)") |>
  dplyr::left_join(valid_auroc_df, by = "fold")
new_valid_auprc_df <- dplyr::bind_rows(valid_auprc_ls, .id = "fold") |>
  dplyr::rename_with(\(x) sprintf("%s (new)", stringr::str_to_title(x))) |>
  dplyr::rename(fold = "Fold (new)") |>
  dplyr::left_join(valid_auprc_df, by = "fold")

# Average each metric over folds (old and new columns alike).
new_mean_valid_auroc_df <- new_valid_auroc_df |>
  dplyr::summarise(dplyr::across(-fold, \(x) mean(x, na.rm = TRUE)))
new_mean_valid_auprc_df <- new_valid_auprc_df |>
  dplyr::summarise(dplyr::across(-fold, \(x) mean(x, na.rm = TRUE)))
# Refit the best model (random forest, per validation results) on the full
# training set — now with engineered features — and evaluate it on the
# held-out test image.
train_data <- dplyr::select(train_data_all, tidyselect::all_of(keep_vars))
test_data <- dplyr::select(test_data_all, tidyselect::all_of(keep_vars))

best_fit <- ranger::ranger(
  binary_label ~ .,
  data = train_data, probability = TRUE, verbose = FALSE
)
# Predicted probability of the positive (second) class.
test_preds <- predict(best_fit, test_data)$predictions[, 2]

test_auroc <- yardstick::roc_auc_vec(
  truth = test_data$binary_label, estimate = test_preds,
  event_level = "second"
)
test_auprc <- yardstick::pr_auc_vec(
  truth = test_data$binary_label, estimate = test_preds,
  event_level = "second"
)

# Combine mean validation metrics with the test metric into one table.
new_auroc_df <- new_mean_valid_auroc_df |>
  dplyr::rename_with(
    \(x) sprintf("Validation %s AUROC", stringr::str_to_title(x))
  ) |>
  dplyr::mutate(`Test AUROC` = test_auroc)
new_auprc_df <- new_mean_valid_auprc_df |>
  dplyr::rename_with(
    \(x) sprintf("Validation %s AUPRC", stringr::str_to_title(x))
  ) |>
  dplyr::mutate(`Test AUPRC` = test_auprc)
# Display fold-wise validation AUROC with and without engineered features.
vthemes::pretty_kable(
  new_valid_auroc_df,
  caption = "Fold-wise Validation AUROC for Various Models"
)

| fold | Logistic (new) | Lasso (new) | Ridge (new) | Rf (new) | logistic | lasso | ridge | rf |
|---|---|---|---|---|---|---|---|---|
| 6 | 0.875 | 0.875 | 0.880 | 0.912 | 0.885 | 0.885 | 0.886 | 0.881 |
| 5 | 0.820 | 0.821 | 0.827 | 0.791 | 0.791 | 0.798 | 0.811 | 0.713 |
| 7 | 0.876 | 0.874 | 0.862 | 0.919 | 0.817 | 0.814 | 0.815 | 0.851 |
| 8 | 0.945 | 0.945 | 0.955 | 0.952 | 0.907 | 0.908 | 0.912 | 0.928 |
| 10 | 0.430 | 0.427 | 0.383 | 0.635 | 0.378 | 0.378 | 0.382 | 0.493 |
| 9 | 0.732 | 0.733 | 0.726 | 0.672 | 0.681 | 0.681 | 0.678 | 0.578 |
| 11 | 0.342 | 0.327 | 0.244 | 0.834 | 0.130 | 0.129 | 0.124 | 0.602 |
| 12 | 0.788 | 0.786 | 0.763 | 0.901 | 0.652 | 0.653 | 0.655 | 0.843 |
| Validation Logistic (New) AUROC | Validation Lasso (New) AUROC | Validation Ridge (New) AUROC | Validation Rf (New) AUROC | Validation Logistic AUROC | Validation Lasso AUROC | Validation Ridge AUROC | Validation Rf AUROC | Test AUROC |
|---|---|---|---|---|---|---|---|---|
| 0.726 | 0.724 | 0.705 | 0.827 | 0.655 | 0.656 | 0.658 | 0.736 | 0.821 |
# Display fold-wise validation AUPRC with and without engineered features.
vthemes::pretty_kable(
  new_valid_auprc_df,
  caption = "Fold-wise Validation AUPRC for Various Models"
)

| fold | Logistic (new) | Lasso (new) | Ridge (new) | Rf (new) | logistic | lasso | ridge | rf |
|---|---|---|---|---|---|---|---|---|
| 6 | 0.759 | 0.759 | 0.767 | 0.821 | 0.759 | 0.760 | 0.762 | 0.777 |
| 5 | 0.963 | 0.964 | 0.965 | 0.948 | 0.953 | 0.955 | 0.959 | 0.925 |
| 7 | 0.365 | 0.362 | 0.336 | 0.507 | 0.289 | 0.286 | 0.284 | 0.359 |
| 8 | 0.502 | 0.504 | 0.512 | 0.527 | 0.372 | 0.373 | 0.380 | 0.485 |
| 10 | 0.344 | 0.343 | 0.324 | 0.572 | 0.321 | 0.321 | 0.323 | 0.388 |
| 9 | 0.379 | 0.379 | 0.358 | 0.355 | 0.311 | 0.311 | 0.306 | 0.266 |
| 11 | 0.0168 | 0.0159 | 0.0132 | 0.0737 | 0.0117 | 0.0117 | 0.0116 | 0.0328 |
| 12 | 0.242 | 0.238 | 0.203 | 0.601 | 0.153 | 0.153 | 0.158 | 0.340 |
| Validation Logistic (New) AUPRC | Validation Lasso (New) AUPRC | Validation Ridge (New) AUPRC | Validation Rf (New) AUPRC | Validation Logistic AUPRC | Validation Lasso AUPRC | Validation Ridge AUPRC | Validation Rf AUPRC | Test AUPRC |
|---|---|---|---|---|---|---|---|---|
| 0.446 | 0.445 | 0.435 | 0.551 | 0.396 | 0.396 | 0.398 | 0.446 | 0.400 |
Let’s look at the held-out folds where the methods didn’t perform so well and compare the predictions across methods.
# Variables to visualize per fold: truth, each model's predictions, the raw
# features, and the engineered features.
plot_vars <- c(
  "binary_label", "logistic", "lasso", "ridge", "rf",
  "DF", "CF", "BF", "AF", "AN", "NDAI", "SD", "CORR"
)
# For each held-out fold, plot true labels, model predictions, and features
# side by side to diagnose where the models struggle.
for (fold in unique(train_data_all$block_id)) {
  cat(sprintf("\n\n#### Fold %s\n\n", fold))
  preds_df <- valid_preds_ls[[fold]]
  plt_ls <- list()
  for (var in plot_vars) {
    plt_ls[[var]] <- plot_cloud_data(preds_df, var)
  }
  plt <- patchwork::wrap_plots(plt_ls, ncol = 5)
  vthemes::subchunkify(
    plt, i = subchunk_idx, fig_width = 14, fig_height = 6
  )
  subchunk_idx <- subchunk_idx + 1
}